import jsonlines

from datasets import load_dataset
from tqdm import tqdm
import jsonlines

# Build "helpsteer_label_filter.jsonl": the records of the training file that
# do not yet have a non-empty ability label in "helpsteer_label.jsonl".
#
# Records are matched across the two files by the first RESPONSE_KEY_LEN
# characters of their "response" field, so two responses sharing that prefix
# are treated as the same record.
RESPONSE_KEY_LEN = 100

# Prefix keys of every record that already carries at least one ability label.
# Records whose label["ability"] is empty are not counted as finished.
with jsonlines.open("helpsteer_label.jsonl") as f:
    finish_datas = {
        line["response"][:RESPONSE_KEY_LEN]
        for line in tqdm(f)
        if line["label"]["ability"]
    }

# Keep, in file order, the training records whose prefix key is not finished.
with jsonlines.open("/share/project/lijijie/tools/instruction_follow/dpo_data/helpsteer2/train.jsonl-2") as f:
    datas = [
        line
        for line in f
        if line["response"][:RESPONSE_KEY_LEN] not in finish_datas
    ]

# Write the remaining records unchanged, one JSON object per line.
with jsonlines.open("helpsteer_label_filter.jsonl", "w") as wf:
    wf.write_all(datas)



